===== ===== ===== ===== ===== Capstone Project ===== ===== ===== ===== =====

install.packages("e1071",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'e1071'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\e1071\libs\x64\e1071.dll to D:
## \Documents\R\win-library\3.6\e1071\libs\x64\e1071.dll: Permission denied
## Warning: restored 'e1071'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("ggplot2",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("corrplot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'corrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("ggcorrplot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'ggcorrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("klaR",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'klaR' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("cluster",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'cluster' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cluster'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\cluster\libs\x64\cluster.dll to D:
## \Documents\R\win-library\3.6\cluster\libs\x64\cluster.dll: Permission
## denied
## Warning: restored 'cluster'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("fpc",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'fpc' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("class",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'class' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'class'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\class\libs\x64\class.dll to D:
## \Documents\R\win-library\3.6\class\libs\x64\class.dll: Permission denied
## Warning: restored 'class'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("rpart",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'rpart' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'rpart'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\rpart\libs\x64\rpart.dll to D:
## \Documents\R\win-library\3.6\rpart\libs\x64\rpart.dll: Permission denied
## Warning: restored 'rpart'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("cowplot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'cowplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("randomForest",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'randomForest' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'randomForest'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying D:
## \Documents\R\win-library\3.6\00LOCK\randomForest\libs\x64\randomForest.dll
## to D:\Documents\R\win-library\3.6\randomForest\libs\x64\randomForest.dll:
## Permission denied
## Warning: restored 'randomForest'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("rpart.plot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'rpart.plot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("tree",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'tree' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'tree'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\tree\libs\x64\tree.dll to D:
## \Documents\R\win-library\3.6\tree\libs\x64\tree.dll: Permission denied
## Warning: restored 'tree'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("glmnet",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'glmnet' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'glmnet'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\glmnet\libs\x64\glmnet.dll to D:
## \Documents\R\win-library\3.6\glmnet\libs\x64\glmnet.dll: Permission denied
## Warning: restored 'glmnet'
## 
## The downloaded binary packages are in
##  C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
library("tree")
library("rpart.plot")
## Loading required package: rpart
library("cowplot")
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library("rpart")
library("class")
library("fpc")
library("cluster")
library("plyr")
library("klaR")
## Loading required package: MASS
library("ggplot2")
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library("e1071")
library("corrplot")
## corrplot 0.84 loaded
library("ggcorrplot")
MAE <- function(actual, predicted){
   mean(abs(actual- predicted))
 }

RMSE <- function(actual, predicted){
    sqrt(mean((predicted-actual)^2))
  }

In this part we shall mostly look to clean our data, which includes: address missing/duplicate values, look for outliers, correct data types , fixing categorical variables, Distribution of variables, Low variance filter.

0.0 - Importing data

raw_data = read.csv2(file='C:\\Users\\Abdullah\\Desktop\\housePrices\\train.csv', header = T, sep = ",", dec = ".",stringsAsFactors = FALSE)
modified_data = raw_data ## To ensure we do not touch original data, we replicate into another df and use that as modified version
str(raw_data)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
head(raw_data)

As we can see, we have 81 variables in our data set; 1 ID and 1 salesPrice, 43 categorical and 36 quantitative. Data entries are either int or char data type.

1 - Inital Analysis

1.1 - Univarient Analysis

1.1.1 Looking at our Target variable: Sales Price

summary(raw_data$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
typeof(raw_data$SalePrice)
## [1] "integer"
myhist <- hist(raw_data$SalePrice)

multiplier <- myhist$counts / myhist$density
mydensity <- density(raw_data$SalePrice)
mydensity$y <- mydensity$y * multiplier[1]

plot(myhist, xlab = "Sales Price", main = "Histogram of Sales")
lines(mydensity)

We can see that the average sales price of a house is $181k, the distribution of which is skewed to the right. As we can see from the histogram, the right tail is longer and the mass of the concentration of data is to the left of the graph, suggesting a positive skew value. The peak also looks very sharp, assuming a high kurtosis value.

boxplot(raw_data$SalePrice)

We can see that there are many outlier in the SalePrice. In fact, we can measure knowing that outliers are classified as values over 3rd Quartile + 1.5*(IQR)

count = 0;
x = 1;
while (x < 1461) {
  if (raw_data$SalePrice[x] > 340000){
    count = count + 1
  }
  x = x + 1 
}
cat("Number of Sales that lie as outliers are: ", count)
## Number of Sales that lie as outliers are:  61

Now that we’ve seen that there are 61 outliers in the Sale Price, lets look into them.

outlier_sales <- subset(raw_data, raw_data$SalePrice > 340000)
outlier_sales

As we can see, there are missing values for many of the entries. Perhaps it is better that we look to address missing values in our other 80 variables.

1.1.2 Addressing Nas, missing values in our House Attributes

# Counting number of nulls in each col
x = 1
cat("---NULL COUNT---\n")
## ---NULL COUNT---
while (x<81){
  if(sum(is.na((raw_data[x]))>0)){
    cat("Number of nulls in ",(colnames(raw_data[x])), ": ")
    cat(sum(is.na(raw_data[x])), "\n")
  }
  x = x + 1 
}
## Number of nulls in  LotFrontage : 259 
## Number of nulls in  Alley : 1369 
## Number of nulls in  MasVnrType : 8 
## Number of nulls in  MasVnrArea : 8 
## Number of nulls in  BsmtQual : 37 
## Number of nulls in  BsmtCond : 37 
## Number of nulls in  BsmtExposure : 38 
## Number of nulls in  BsmtFinType1 : 37 
## Number of nulls in  BsmtFinType2 : 38 
## Number of nulls in  Electrical : 1 
## Number of nulls in  FireplaceQu : 690 
## Number of nulls in  GarageType : 81 
## Number of nulls in  GarageYrBlt : 81 
## Number of nulls in  GarageFinish : 81 
## Number of nulls in  GarageQual : 81 
## Number of nulls in  GarageCond : 81 
## Number of nulls in  PoolQC : 1453 
## Number of nulls in  Fence : 1179 
## Number of nulls in  MiscFeature : 1406

There are 19 variables with NA values. It is important to note that this might mean that the observation is missing or perhaps a NA means somthing itself. We must consult the data dictionary.

LotFrontage has 259 NA values. This is integer value repersenting Linear feet of street connected to property. Looking into the dataset, we see that the other observations have a value anywhere from 21-313. We can safetly assume that these 259 entries repersent a house that has 0 linear feet of street being connected to the property. Of course it could be a case of missin value, but it is possible that the homes do not have the property connecting to a steet, we take this assumptions and instead change the NAs to 0. As such, We will not be removing such observations

v = 1
while (v<1461){
  if (is.na(modified_data$LotFrontage[v])){
    modified_data$LotFrontage[v] = 0
  }
  v = v + 1
}

Alley has 1369 NAs, this high number suggests that the NAs must mean somthing rather than missing value. The dictionary shows us the NA repersents no alley Access. Rather than Na, lets change that to ‘None’ a bit more repersentative.

v = 1
while (v<1461){
  if (is.na(modified_data$Alley[v])){
    modified_data$Alley[v] = "None"
  }
  v = v + 1
}

MasVnrType and MasVnrArea both have 8 missing values. This is very suspeious. Could it be that they are of the same observations and due to another house feature? Let us check. Maybe our data dictionary can help us with this.

MasVnr <- subset(raw_data, is.na(raw_data$MasVnrType))
MasVnr$Area <- subset(raw_data, is.na(raw_data$MasVnrArea))
MasVnr

These both variables are Na in the same observation. What is weird is that MasVnrType has a None category, so that means that it isn’t a case that there was no masonry veneer, rather I would say that the Masonry data was not collected for these 8 observations. This would tell me that it might be good to remove the observations

modified_data <- subset(modified_data, !is.na(raw_data$MasVnrType)) # we only do it for when one attribute isnt na, because the other overlaps.

Next lets look at BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2. All these have 37-38 missing values. Lets see if they are overlapping observations again, and see if we can figure out a pattern.

Bsmt <- subset(raw_data, is.na(raw_data$BsmtExposure))
Bsmt

Here we can see that they all overlap and there is a pattern here. They are all basement, perhaps there is somthing odd about the basement. Looking in the data dictionary confirms our suspicions, NA repersents No basements for all of there variables. These are not missing values, and NA is a very meaningful entry. I don’t like the use of NA, I would rather use a more descriptive categorical name: NoB

v = 1
while (v<1453){
  if (is.na(modified_data$BsmtQual[v])){
    modified_data$BsmtQual[v] = "NoB"
  }
  if (is.na(modified_data$BsmtCond[v])){
    modified_data$BsmtCond[v] = "NoB"
  }
  if (is.na(modified_data$BsmtExposure[v])){
    modified_data$BsmtExposure[v] = "NoB"
  }
  if (is.na(modified_data$BsmtFinType1[v])){
    modified_data$BsmtFinType1[v] = "NoB"
  }
  if (is.na(modified_data$BsmtFinType2[v])){
    modified_data$BsmtFinType2[v] = "NoB"
  }
  v = v + 1
}
Bsmt1 <- subset(modified_data, modified_data$BsmtExposure == "NoB")
Bsmt1
Bsmt2 <- subset(modified_data, modified_data$BsmtFinType2 == "NoB")
Bsmt2

It’s important to note that observation 949 has a basement that is unfinished and the Exposure is set to NA. This is a potential missing value, only because we know that the other categorical variables label this observation as UNFINSHED rather than NO BASEMENT. The exposture could have been set at No exposure, but rather was set to a level repersenting no basement. I think it’s safe to remove this data observation.

Also observation 333 has a signular basement with FinType2 as NA. FinType1 does have a value, however, and after looking through other observations, this stikes as very odd. FinType repersents the finishing of the basement, FinType1 repersents the first layer, and FinType2 repersents any additional layers (if there are any), however in the event there is one layer, other entries would have FinType2 as UNF or No Basement. I think it is safe to eliminate this observation.

modified_data <- modified_data[-c(949),]
modified_data <- modified_data[-c(333),]

Poof! The two records are gone.

For the misisng electrical data, we will be removing it, because NA has no meaning behind it and there needs to be an option for it.

modified_data <- subset(modified_data, !is.na(modified_data$Electrical))

Poof! It’s gone!

For FireplaceQu, I looked ahead at the data dictionary, and it clearly states all NAs means no Fireplace, so we can attribute this to a better categorical variable: NoF

v = 1
while (v<1450){
  if (is.na(modified_data$FireplaceQu[v])){
    modified_data$FireplaceQu[v] = "NoF"
  }
  v = v + 1
}

And now again we have 5 variables that describe the same part of the house, the garage (GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond), and they have equal amounts of NA (81). And to no suprise, NA for each of those variables means No garage. We’ll chnage this to NoG instead.

v = 1
while (v<1450){
  if (is.na(modified_data$GarageType[v])){
    modified_data$GarageType[v] = "NoG"
  }
  if (is.na(modified_data$GarageYrBlt[v])){
    modified_data$GarageYrBlt[v] = "NoG"
  }
  if (is.na(modified_data$GarageFinish[v])){
    modified_data$GarageFinish[v] = "NoG"
  }
  if (is.na(modified_data$GarageQual[v])){
    modified_data$GarageQual[v] = "NoG"
  }
  if (is.na(modified_data$GarageCond[v])){
    modified_data$GarageCond[v] = "NoG"
  }
  v = v + 1
}

I found it that the next three attributes had a lot of NA entires. So I looked into the data dictionary, they all repersent the missing item for the attribute. They are not missing values, so will not be excluded, but given better names. NA for PoolQc will be chnaged to NoP, NA for Fence will be NoF, and NA for MiscFeature will become NoM.

v = 1
while (v<1450){
  if (is.na(modified_data$PoolQC[v])){
    modified_data$PoolQC[v] = "NoP"
  }
  if (is.na(modified_data$Fence[v])){
    modified_data$Fence[v] = "NoF"
  }
  if (is.na(modified_data$MiscFeature[v])){
    modified_data$MiscFeature[v] = "NoM"
  }
  v = v + 1
}

We should be done will addressing missing values, lets check!

x = 1
cat("---NULL COUNT---\n")
## ---NULL COUNT---
while (x<81){
  if(sum(is.na((modified_data[x]))>0)){
    cat("Number of nulls in ",(colnames(modified_data[x])), ": ")
    cat(sum(is.na(modified_data[x])), "\n")
  }
  x = x + 1 
}
outlier_sales <- subset(modified_data, modified_data$SalePrice > 340000)
outlier_sales[order(outlier_sales$SalePrice),]

1.1.3 Fixing Data types

modified_data$MSSubClass <- formatC(modified_data$MSSubClass)
modified_data$BsmtFullBath <- formatC(modified_data$BsmtFullBath)
modified_data$BsmtHalfBath <- formatC(modified_data$BsmtHalfBath)
modified_data$FullBath <- formatC(modified_data$FullBath)
modified_data$HalfBath <- formatC(modified_data$HalfBath)
modified_data$BedroomAbvGr <- formatC(modified_data$BedroomAbvGr)
modified_data$KitchenAbvGr <- formatC(modified_data$KitchenAbvGr)
modified_data$TotRmsAbvGrd <- formatC(modified_data$TotRmsAbvGrd)
modified_data$Fireplaces <- formatC(modified_data$Fireplaces)
modified_data$MoSold <- formatC(modified_data$MoSold)
modified_data$YrSold <- formatC(modified_data$YrSold)
modified_data$OverallQual <- formatC(modified_data$OverallQual)
modified_data$OverallCond <- formatC(modified_data$OverallCond)

1.1.4 Reducing the levels on for Categorical attributes and dealing with outlier for Quantitative attributes. Fixing distributions of categorical variables

# I will be using temp as an alt to the actual modified dataset until everything is finalized. 
colname <- colnames(modified_data)
temp = modified_data
#remeber which attributes to remove
toremove = as.vector(0)
outers= as.vector(0)
gg = 0
attrib = 0
global = 0

This will be a strainious process, but I will be going thru each variable. we shall see if there is a need to reduce the levels for the categorical ones, and if outliers need to be dealt with for the numerical ones.

attrib = attrib + 1
colname[attrib]
## [1] "Id"
if (typeof(temp[,attrib]) == 'integer'){
  graph = boxplot(temp[,attrib])
  print(length(graph$out))
  print(graph$out)
  outlierID = which(temp[,attrib]>graph$stats[5],)
  temp[outlierID,]
}

## [1] 0
## numeric(0)
if (typeof(temp[,attrib]) == 'character'){
  print(as.data.frame(table(temp[,attrib])))
  print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
  # global = global + 1
  # toremove[global] = attrib
}

Obviously there is no outlier problem with our ID variables. Nothing to be removed.

attrib = attrib + 1
colname[attrib]
## [1] "MSSubClass"
if (typeof(temp[,attrib]) == 'integer'){
graph = boxplot(temp[,attrib])
length(graph$out)
graph$out
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}

if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# Reassigning levels  
temp$MSSubClass <- factor(temp$MSSubClass)
levels(temp$MSSubClass) <- list(One_Story=c("20","30","40","45", "50"), Two_Story=c("60","70","75"), SplitDuplex=c("80","85","90"), PUD=c("120","160","180","190"))

print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

# global = global + 1
# toremove[global] = attrib
}
##    Var1 Freq
## 1   120   85
## 2   160   63
## 3   180   10
## 4   190   30
## 5    20  532
## 6    30   69
## 7    40    4
## 8    45   12
## 9    50  144
## 10   60  296
## 11   70   60
## 12   75   16
## 13   80   57
## 14   85   19
## 15   90   52

##          Var1 Freq
## 1   One_Story  761
## 2   Two_Story  372
## 3 SplitDuplex  128
## 4         PUD  188

Reduced the number of factors

attrib = attrib + 1
colname[attrib]
## [1] "MSZoning"
if (typeof(temp[,attrib]) == 'integer'){
graph = boxplot(temp[,attrib])
length(graph$out)
graph$out
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
##      Var1 Freq
## 1 C (all)   10
## 2      FV   62
## 3      RH   16
## 4      RL 1144
## 5      RM  217

I decided that the number of categorical factors are appropriate, and do not need to reduce. However Judging by the variance, Probably will be looking to remove this one. The distribution is too skewed. Variance looks too little.

attrib = attrib + 1
colname[attrib]
## [1] "LotFrontage"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1],)
temp[outlierID,]
outlier_lotFrontage = subset(temp, temp[,attrib]>graph$stats[5])
gg = gg+1
outers[gg] = attrib
}

## [1] 16
##  [1] 141 174 174 140 150 137 144 149 313 168 182 138 160 152 313 153
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


# global = global + 1
# toremove[global] = attrib
}

I flagged all the potential outliers into the var outlier. WIll not remove yet because i might just be removing this attribute all together.

attrib = attrib + 1
colname[attrib]
## [1] "LotArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1],)
temp[outlierID,]
outlier_lotArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 67
##  [1]  50271  19900  21000  21453  19378  31770  22950  25419 159000  19296
## [11]  39104  19138  18386 215245 164660  20431  18800  53107  34650  22420
## [21]  21750  70761  53227  40094  21872  21780  25095  46589  20896  18450
## [31]  21535  26178 115149  21695  53504  21384  28698  45600  17920  25286
## [41]  27650  24090  25000   1300  21286  21750  29959  18000  23257  17755
## [51]  35760  18030  35133  32463  18890  24682  23595  17871  36500  63887
## [61]  20781  25339  57200  20544  19690  21930  26142
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "Street"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1 Grvl    6
## 2 Pave 1443

Probably will be looking to remove this one. The distribution is too skewed. Variance looks too little.

attrib = attrib + 1
colname[attrib]
## [1] "Alley"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

 global = global + 1
 toremove[global] = attrib
}
##   Var1 Freq
## 1 Grvl   50
## 2 None 1359
## 3 Pave   40

Probably will be looking to remove this one. The distribution is too skewed. Variance looks too little.

attrib = attrib + 1
colname[attrib]
## [1] "LotShape"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# Reassigning levels  
temp$LotShape <- factor(temp$LotShape)
levels(temp$LotShape) <- list(IR=c("IR1","IR2","IR3"), Reg="Reg")

print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

# global = global + 1
# toremove[global] = attrib
}
##   Var1 Freq
## 1  IR1  482
## 2  IR2   41
## 3  IR3   10
## 4  Reg  916

##   Var1 Freq
## 1   IR  533
## 2  Reg  916

After looking at the distribution (variances) in each level and how closely related IR1,IR2,IR3 were to each other, I decided to group them. This will level the distribution a bit.

attrib = attrib + 1
colname[attrib]
## [1] "LandContour"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1  Bnk   63
## 2  HLS   50
## 3  Low   36
## 4  Lvl 1300

Will be dropped for sure.

attrib = attrib + 1
colname[attrib]
## [1] "Utilities"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

global = global + 1
toremove[global] = attrib
}
##     Var1 Freq
## 1 AllPub 1448
## 2 NoSeWa    1

Will be dropped for sure.

attrib = attrib + 1
colname[attrib]
## [1] "LotConfig"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##      Var1 Freq
## 1  Corner  262
## 2 CulDSac   93
## 3     FR2   47
## 4     FR3    3
## 5  Inside 1044

Will drop.

attrib = attrib + 1
colname[attrib]
## [1] "LandSlope"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1  Gtl 1371
## 2  Mod   65
## 3  Sev   13

Will drop

attrib = attrib + 1
colname[attrib]
## [1] "Neighborhood"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##       Var1 Freq
## 1  Blmngtn   17
## 2  Blueste    2
## 3   BrDale   16
## 4  BrkSide   58
## 5  ClearCr   28
## 6  CollgCr  148
## 7  Crawfor   50
## 8  Edwards  100
## 9  Gilbert   78
## 10  IDOTRR   37
## 11 MeadowV   17
## 12 Mitchel   49
## 13   NAmes  225
## 14 NoRidge   41
## 15 NPkVill    9
## 16 NridgHt   75
## 17  NWAmes   73
## 18 OldTown  113
## 19  Sawyer   74
## 20 SawyerW   58
## 21 Somerst   83
## 22 StoneBr   25
## 23   SWISU   25
## 24  Timber   37
## 25 Veenker   11

Will not touch this because i feel it will be very important.

attrib = attrib + 1
colname[attrib]
## [1] "Condition1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

global = global + 1
toremove[global] = attrib
}
##     Var1 Freq
## 1 Artery   48
## 2  Feedr   81
## 3   Norm 1249
## 4   PosA    8
## 5   PosN   19
## 6   RRAe   11
## 7   RRAn   26
## 8   RRNe    2
## 9   RRNn    5

Distribution is very bad, will drop

attrib = attrib + 1
colname[attrib]
## [1] "Condition2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##     Var1 Freq
## 1 Artery    2
## 2  Feedr    6
## 3   Norm 1434
## 4   PosA    1
## 5   PosN    2
## 6   RRAe    1
## 7   RRAn    1
## 8   RRNn    2

Even worse vairance, will drop.

attrib = attrib + 1
colname[attrib]
## [1] "BldgType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##     Var1 Freq
## 1   1Fam 1211
## 2 2fmCon   31
## 3 Duplex   52
## 4  Twnhs   43
## 5 TwnhsE  112

I like the idea of this attribute, unfortunatly the vaiance is not within my ruleset

attrib = attrib + 1
colname[attrib]
## [1] "HouseStyle"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##     Var1 Freq
## 1 1.5Fin  154
## 2 1.5Unf   14
## 3 1Story  720
## 4 2.5Fin    8
## 5 2.5Unf   11
## 6 2Story  442
## 7 SFoyer   36
## 8   SLvl   64

Although the distribution is not that good, we shall leave it as is because we want to perserve the information it gives. Perhaps later we will remove it.

attrib = attrib + 1
colname[attrib]
## [1] "OverallQual"
if (colname[attrib] == 'OverallQual'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##    Var1 Freq
## 1     1    2
## 2    10   17
## 3     2    3
## 4     3   20
## 5     4  116
## 6     5  395
## 7     6  372
## 8     7  314
## 9     8  167
## 10    9   43

if (typeof(temp[,attrib]) == 'integer' & colname[attrib] != 'OverallQual'){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##    Var1 Freq
## 1     1    2
## 2    10   17
## 3     2    3
## 4     3   20
## 5     4  116
## 6     5  395
## 7     6  372
## 8     7  314
## 9     8  167
## 10    9   43

There is no reason for us to be chnaging this. Changing to quality ranges would not help the distribution (ex 1-3, 4-7, 8-10). Will keep as is.

attrib = attrib + 1
colname[attrib]
## [1] "OverallCond"
if (colname[attrib] == 'OverallCond'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    1    1
## 2    2    5
## 3    3   24
## 4    4   57
## 5    5  813
## 6    6  251
## 7    7  205
## 8    8   71
## 9    9   22

if (typeof(temp[,attrib]) == 'integer' & colname[attrib] != 'OverallCond'){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    1    1
## 2    2    5
## 3    3   24
## 4    4   57
## 5    5  813
## 6    6  251
## 7    7  205
## 8    8   71
## 9    9   22

Again, the distribution does not look too good towards the extremes, but there is no way to fix this without removing information. Will keep

attrib = attrib + 1
colname[attrib]
## [1] "YearBuilt"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_yearbuilt = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 7
## [1] 1880 1880 1880 1882 1880 1875 1872
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "YearRemodAdd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}

## [1] 0
## numeric(0)
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "RoofStyle"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##      Var1 Freq
## 1    Flat   13
## 2   Gable 1131
## 3 Gambrel   11
## 4     Hip  285
## 5 Mansard    7
## 6    Shed    2

attrib = attrib + 1
colname[attrib]
## [1] "RoofMatl"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##      Var1 Freq
## 1 ClyTile    1
## 2 CompShg 1423
## 3 Membran    1
## 4   Metal    1
## 5    Roll    1
## 6 Tar&Grv   11
## 7 WdShake    5
## 8 WdShngl    6

will remove this this attrib

attrib = attrib + 1
colname[attrib]
## [1] "Exterior1st"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##       Var1 Freq
## 1  AsbShng   20
## 2  AsphShn    1
## 3  BrkComm    2
## 4  BrkFace   50
## 5   CBlock    1
## 6  CemntBd   59
## 7  HdBoard  222
## 8  ImStucc    1
## 9  MetalSd  220
## 10 Plywood  108
## 11   Stone    2
## 12  Stucco   25
## 13 VinylSd  508
## 14 Wd Sdng  205
## 15 WdShing   25

attrib = attrib + 1
colname[attrib]
## [1] "Exterior2nd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##       Var1 Freq
## 1  AsbShng   20
## 2  AsphShn    3
## 3  Brk Cmn    7
## 4  BrkFace   25
## 5   CBlock    1
## 6  CmentBd   58
## 7  HdBoard  206
## 8  ImStucc   10
## 9  MetalSd  214
## 10   Other    1
## 11 Plywood  142
## 12   Stone    4
## 13  Stucco   26
## 14 VinylSd  497
## 15 Wd Sdng  197
## 16 Wd Shng   38

attrib = attrib + 1
colname[attrib]
## [1] "MasVnrType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##      Var1 Freq
## 1  BrkCmn   15
## 2 BrkFace  445
## 3    None  862
## 4   Stone  127

attrib = attrib + 1
colname[attrib]
## [1] "MasVnrArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_masVnrArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 96
##  [1]  640  650  456 1031  573 1115  576  443  468  600  768  480 1129  436
## [15]  456  664  653  491  748  456  922  506  604  472  481 1600  616  870
## [29]  530  500  510  650  432  473  772  435  562  921  762  594  479  584
## [43]  420  459  452  513  472  660  528  464 1170  630  466  651  442  894
## [57]  513  673  603  860  424 1047  442  816  760  541  423  424  975  450
## [71]  423  571  480  425  660 1378  456  425  420  766  554  632  567  451
## [85]  621  788  796  428  564  579  705  731  420  448  426  438
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "ExterQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex   51
## 2   Fa   14
## 3   Gd  480
## 4   TA  904

attrib = attrib + 1
colname[attrib]
## [1] "ExterCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex    3
## 2   Fa   28
## 3   Gd  145
## 4   Po    1
## 5   TA 1272

attrib = attrib + 1
colname[attrib]
## [1] "Foundation"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##     Var1 Freq
## 1 BrkTil  146
## 2 CBlock  633
## 3  PConc  637
## 4   Slab   24
## 5  Stone    6
## 6   Wood    3

attrib = attrib + 1
colname[attrib]
## [1] "BsmtQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex  120
## 2   Fa   35
## 3   Gd  609
## 4  NoB   37
## 5   TA  648

attrib = attrib + 1
colname[attrib]
## [1] "BsmtCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1   Fa   45
## 2   Gd   64
## 3  NoB   37
## 4   Po    2
## 5   TA 1301

Might drop too

attrib = attrib + 1
colname[attrib]
## [1] "BsmtExposure"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Av  219
## 2   Gd  133
## 3   Mn  114
## 4   No  945
## 5  NoB   38

attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinType1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1  ALQ  220
## 2  BLQ  148
## 3  GLQ  411
## 4  LwQ   74
## 5  NoB   37
## 6  Rec  132
## 7  Unf  427

attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinSF1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtFinSF1 = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 8
## [1] 1810 1880 1904 1767 2260 2188 2096 5644
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinType2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1  ALQ   19
## 2  BLQ   33
## 3  GLQ   14
## 4  LwQ   46
## 5  NoB   38
## 6  Rec   54
## 7  Unf 1245

attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinSF2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtFinSF2 = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 167
##   [1]   32  668  486   93  491  506  712  362   41  169  869  150  670   28
##  [15] 1080  181  768  215  374  208  441  184  279  306  180  712  580  690
##  [29]  692  228  125 1063  620  175  820 1474  264  479  147  232  380  544
##  [43]  294  258  121  180  391  531  344  539  713  210  311 1120  165  532
##  [57]  279   96  495  180  174 1127  139  202  645  123  551  219  606  147
##  [71]  612  480  182  132  336  468  287   35  499  180  180  723  119  182
##  [85]   40  551  117  239   80  472   64 1057  127  630  480  128  377  764
##  [99]  345  539 1085  435  823  500  290  324  634  411  841 1061   93  466
## [113]  396  354  294  149  193  117  273  465  400  468   41  682   64  557
## [127]  230  106  791  240  287  547  391  469  177  108  374  600  492  211
## [141]  168   96 1031  438  375  144   81  906  608  276  661   68  173  972
## [155]  105  420  469  546  334  352  872  374  110  627  163 1029  290
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtUnfSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtUnfSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 29
##  [1] 1777 1768 1907 1686 2336 1694 2121 1869 2153 1969 1709 2042 1774 2046
## [15] 1836 1935 1926 1734 1800 1753 1905 1800 1710 1752 1694 1689 2002 1753
## [29] 1795
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "TotalBsmtSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_totalBsmtSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 60
##  [1]    0    0 2223    0    0    0 2216    0 2392    0 2121 2136 3206    0
## [15]    0    0    0 3094 2153 3200    0 3138    0    0    0    0 2109 2077
## [29] 2444    0    0    0    0 2078    0 2217    0    0 2330    0    0    0
## [43]    0 2524    0    0    0    0    0 2396 2158    0    0 2136    0 2110
## [57] 6110    0 2633    0
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "Heating"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##    Var1 Freq
## 1 Floor    1
## 2  GasA 1417
## 3  GasW   18
## 4  Grav    7
## 5  OthW    2
## 6  Wall    4

attrib = attrib + 1
colname[attrib]
## [1] "HeatingQC"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex  733
## 2   Fa   49
## 3   Gd  239
## 4   Po    1
## 5   TA  427

attrib = attrib + 1
colname[attrib]
## [1] "CentralAir"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1    N   95
## 2    Y 1354

attrib = attrib + 1
colname[attrib]
## [1] "Electrical"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##    Var1 Freq
## 1 FuseA   94
## 2 FuseF   27
## 3 FuseP    3
## 4   Mix    1
## 5 SBrkr 1324

attrib = attrib + 1
colname[attrib]
## [1] "X1stFlrSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_x1stFlrSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 19
##  [1] 2207 2223 2259 2158 2234 2392 2402 3228 3138 2444 2217 2364 2898 2524
## [15] 2411 2196 4692 2156 2633
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "X2ndFlrSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_x2stFlrSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 2
## [1] 1872 2065
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "LowQualFinSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_lowQualFinSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 26
##  [1] 360 513 234 528 572 144 392 371 390 420 473 156 515 360  80  80  53
## [18] 232 481 120 514 397 479 205  80 384
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GrLivArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_grLivArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 31
##  [1] 2945 3222 3608 3112 2794 3493 2978 3228 4676 2775 3194 3395 4316 3279
## [15] 3140 2822 2872 2898 3082 2868 2828 3627 3086 2872 4476 3447 5642 2810
## [29] 2792 3238 2784
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFullBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0  852
## 2    1  581
## 3    2   15
## 4    3    1

attrib = attrib + 1
colname[attrib]
## [1] "BsmtHalfBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0 1368
## 2    1   79
## 3    2    2

attrib = attrib + 1
colname[attrib]
## [1] "FullBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0    9
## 2    1  648
## 3    2  760
## 4    3   32

attrib = attrib + 1
colname[attrib]
## [1] "HalfBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0  908
## 2    1  529
## 3    2   12

attrib = attrib + 1
colname[attrib]
## [1] "BedroomAbvGr"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0    6
## 2    1   49
## 3    2  354
## 4    3  800
## 5    4  211
## 6    5   21
## 7    6    7
## 8    8    1

attrib = attrib + 1
colname[attrib]
## [1] "KitchenAbvGr"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0    1
## 2    1 1382
## 3    2   64
## 4    3    2

attrib = attrib + 1
colname[attrib]
## [1] "KitchenQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bedroomAbvGr = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex   99
## 2   Fa   39
## 3   Gd  578
## 4   TA  733

attrib = attrib + 1
colname[attrib]
## [1] "TotRmsAbvGrd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##    Var1 Freq
## 1    10   47
## 2    11   18
## 3    12   11
## 4    14    1
## 5     2    1
## 6     3   17
## 7     4   96
## 8     5  273
## 9     6  400
## 10    7  325
## 11    8  186
## 12    9   74

attrib = attrib + 1
colname[attrib]
## [1] "Functional"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1 Maj1   13
## 2 Maj2    5
## 3 Min1   31
## 4 Min2   34
## 5  Mod   15
## 6  Sev    1
## 7  Typ 1350

attrib = attrib + 1
colname[attrib]
## [1] "Fireplaces"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1    0  684
## 2    1  647
## 3    2  113
## 4    3    5

attrib = attrib + 1
colname[attrib]
## [1] "FireplaceQu"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex   24
## 2   Fa   33
## 3   Gd  377
## 4  NoF  684
## 5   Po   20
## 6   TA  311

attrib = attrib + 1
colname[attrib]
## [1] "GarageType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##      Var1 Freq
## 1  2Types    6
## 2  Attchd  862
## 3 Basment   19
## 4 BuiltIn   86
## 5 CarPort    9
## 6  Detchd  386
## 7     NoG   81

attrib = attrib + 1
colname[attrib]
## [1] "GarageYrBlt"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##    Var1 Freq
## 1  1900    1
## 2  1906    1
## 3  1908    1
## 4  1910    3
## 5  1914    2
## 6  1915    2
## 7  1916    5
## 8  1918    2
## 9  1920   14
## 10 1921    3
## 11 1922    5
## 12 1923    3
## 13 1924    3
## 14 1925   10
## 15 1926    6
## 16 1927    1
## 17 1928    4
## 18 1929    2
## 19 1930    8
## 20 1931    4
## 21 1932    3
## 22 1933    1
## 23 1934    2
## 24 1935    4
## 25 1936    5
## 26 1937    2
## 27 1938    3
## 28 1939    9
## 29 1940   14
## 30 1941   10
## 31 1942    2
## 32 1945    4
## 33 1946    4
## 34 1947    2
## 35 1948   11
## 36 1949    8
## 37 1950   24
## 38 1951    6
## 39 1952    3
## 40 1953   12
## 41 1954   19
## 42 1955   13
## 43 1956   16
## 44 1957   20
## 45 1958   21
## 46 1959   17
## 47 1960   19
## 48 1961   13
## 49 1962   21
## 50 1963   16
## 51 1964   18
## 52 1965   21
## 53 1966   21
## 54 1967   15
## 55 1968   26
## 56 1969   15
## 57 1970   20
## 58 1971   13
## 59 1972   14
## 60 1973   14
## 61 1974   17
## 62 1975    8
## 63 1976   29
## 64 1977   35
## 65 1978   19
## 66 1979   15
## 67 1980   15
## 68 1981   10
## 69 1982    4
## 70 1983    7
## 71 1984    8
## 72 1985   10
## 73 1986    6
## 74 1987   11
## 75 1988   14
## 76 1989   10
## 77 1990   16
## 78 1991    9
## 79 1992   13
## 80 1993   22
## 81 1994   18
## 82 1995   18
## 83 1996   20
## 84 1997   19
## 85 1998   31
## 86 1999   30
## 87 2000   27
## 88 2001   20
## 89 2002   24
## 90 2003   49
## 91 2004   52
## 92 2005   65
## 93 2006   58
## 94 2007   45
## 95 2008   29
## 96 2009   21
## 97 2010    3
## 98  NoG   81

I will not remove this because it shows a pattern.

attrib = attrib + 1
colname[attrib]
## [1] "GarageFinish"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1  Fin  346
## 2  NoG   81
## 3  RFn  417
## 4  Unf  605

attrib = attrib + 1
colname[attrib]
## [1] "GarageCars"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_garagecars = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 5
## [1] 4 4 4 4 4
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GarageArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_garageArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 20
##  [1] 1166  968 1053 1025 1390 1134  983 1020 1220 1248 1043 1052  995 1356
## [15] 1052  954 1014 1418  968 1069
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GarageQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex    3
## 2   Fa   48
## 3   Gd   14
## 4  NoG   81
## 5   Po    3
## 6   TA 1300

attrib = attrib + 1
colname[attrib]
## [1] "GarageCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex    2
## 2   Fa   35
## 3   Gd    9
## 4  NoG   81
## 5   Po    7
## 6   TA 1315

attrib = attrib + 1
colname[attrib]
## [1] "PavedDrive"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1    N   90
## 2    P   30
## 3    Y 1329

attrib = attrib + 1
colname[attrib]
## [1] "WoodDeckSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_woodDeck = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 32
##  [1] 857 576 476 574 441 468 670 495 536 519 466 517 426 503 486 486 511
## [18] 421 550 509 474 728 436 431 448 439 635 500 668 586 431 736
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "OpenPorchSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_openPorch = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 75
##  [1] 204 213 258 199 234 184 205 228 238 260 198 172 208 228 184 250 175
## [18] 195 214 231 192 187 176 523 285 406 182 502 274 172 243 235 312 267
## [35] 265 288 341 204 174 247 291 312 418 240 364 188 207 234 192 191 252
## [52] 189 282 224 319 244 185 200 180 263 304 234 240 192 198 287 292 207
## [69] 241 547 211 184 262 210 236
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "EnclosedPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}

## [1] 207
##   [1] 272 228 205 176 205  87 172 102  37 144  64 114 202 128 156  44  77
##  [18] 144 192 144 140 180 228 128 183  39 184  40 552  30 126  96  60 150
##  [35] 120 202  77 112 252  52 224 234 144 244 268 137  24 108 294 177 218
##  [52] 242  91 112 160 130 184 126 169 105  34  96 248 236 120  32  80 115
##  [69] 291 184 116 158 112 210  36 156 144  84 148 116 120 136 102 240  54
##  [86] 112  39 100  36 189 293 164  40 216 239 112 252 240 180  67  90 120
## [103]  56 112 129  40  98 143 216 234 112 112  70 386 154 185 156 156 134
## [120] 196 264 185 275  96 120 112 116 230 254  68 194 192  34 150 164 112
## [137] 224  32 318 244  48  94 138 108 112 226 192 174 228  19 170 220 128
## [154]  80 115 137 192 252 112  96 176 216 176 214 280  96 116 102 190 236
## [171] 192  84 330 208 145 259 126 264  81 164  42 123 162 100 286 190 168
## [188]  20 301 198  96 221 112 212  50 150 168 112 160 114 216 154  99 158
## [205] 216 252 112
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "X3SsnPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_X3Ss = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 24
##  [1] 320 407 130 180 168 180 140 508 238 245 196 144 144 182 168 162  23
## [18] 168 216  96 216 153 290 304
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "ScreenPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_screenPorch = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 116
##   [1] 176 198 291 252  99 184 168 130 142 192 410 224 266 170 154 153 144
##  [18] 142 128 259 160 198 271 234 184 374 192 185 182  90 144 224 396 170
##  [35] 176 140 276 192 180 161 168 145 200 122  95 144 120  60 120 126 189
##  [52] 260 147 385 287 200 156 100 180 216 210 197 204 192 225 192 152 175
##  [69] 126 312 222 265 224 322 120 190 233  63 147 180  53 143 189 189 189
##  [86] 192 160 160 126 100 273 180  90 288 263 224 147 120  80 163  90 288
## [103] 116 259 224 216 480 120 178 440 155 168 220 119 165  40
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "PoolArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_poolarea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 7
## [1] 512 648 576 555 480 519 738
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "PoolQC"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1   Ex    2
## 2   Fa    2
## 3   Gd    3
## 4  NoP 1442

Tooo big of a variance

attrib = attrib + 1
colname[attrib]
## [1] "Fence"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##    Var1 Freq
## 1 GdPrv   59
## 2  GdWo   54
## 3 MnPrv  156
## 4  MnWw   11
## 5   NoF 1169

attrib = attrib + 1
colname[attrib]
## [1] "MiscFeature"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


global = global + 1
toremove[global] = attrib
}
##   Var1 Freq
## 1 Gar2    2
## 2  NoM 1395
## 3 Othr    2
## 4 Shed   49
## 5 TenC    1

attrib = attrib + 1
colname[attrib]
## [1] "MiscVal"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_miscVal = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}

## [1] 52
##  [1]   700   350   700   500   400   700   480   400   400   450   450
## [12]   500   450   700   400 15500  1200   800   480   400  2000  2000
## [23]   600   500   600   600  3500   500   400   450   500  1300  1200
## [34]   500   400    54   500   400   400  2000   620   400   560   500
## [45]   700  1400   400  8300   600  1150  2000  2500
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))


#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "MoSold"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

#global = global + 1
#toremove[global] = attrib
}
##    Var1 Freq
## 1     1   58
## 2    10   89
## 3    11   78
## 4    12   58
## 5     2   52
## 6     3  104
## 7     4  140
## 8     5  201
## 9     6  253
## 10    7  233
## 11    8  121
## 12    9   62

attrib = attrib + 1
colname[attrib]
## [1] "YrSold"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

#global = global + 1
#toremove[global] = attrib
}
##   Var1 Freq
## 1 2006  313
## 2 2007  327
## 3 2008  299
## 4 2009  336
## 5 2010  174

attrib = attrib + 1
colname[attrib]
## [1] "SaleType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

global = global + 1
toremove[global] = attrib
}
##    Var1 Freq
## 1   COD   43
## 2   Con    2
## 3 ConLD    9
## 4 ConLI    5
## 5 ConLw    5
## 6   CWD    4
## 7   New  119
## 8   Oth    3
## 9    WD 1259

To drop because of variance

attrib = attrib + 1
colname[attrib]
## [1] "SaleCondition"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]

}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))

#global = global + 1
#toremove[global] = attrib
}
##      Var1 Freq
## 1 Abnorml  101
## 2 AdjLand    4
## 3  Alloca   11
## 4  Family   20
## 5  Normal 1191
## 6 Partial  122

Time to make some decisons.

print("The following attributes are under review to be removed:")
## [1] "The following attributes are under review to be removed:"
colname[toremove]
##  [1] "MSZoning"    "Street"      "Alley"       "LandContour" "Utilities"  
##  [6] "LotConfig"   "LandSlope"   "Condition1"  "Condition2"  "BldgType"   
## [11] "RoofMatl"    "BsmtCond"    "Heating"     "CentralAir"  "Electrical" 
## [16] "Functional"  "GarageQual"  "GarageCond"  "PavedDrive"  "PoolQC"     
## [21] "Fence"       "MiscFeature" "SaleType"
print("The following attributes have outliers that need to be addressed:")
## [1] "The following attributes have outliers that need to be addressed:"
colname[outers]
##  [1] "LotFrontage"  "LotArea"      "YearBuilt"    "MasVnrArea"  
##  [5] "BsmtFinSF1"   "BsmtFinSF2"   "BsmtUnfSF"    "TotalBsmtSF" 
##  [9] "X1stFlrSF"    "X2ndFlrSF"    "LowQualFinSF" "GrLivArea"   
## [13] "GarageCars"   "GarageArea"   "WoodDeckSF"   "OpenPorchSF" 
## [17] "X3SsnPorch"   "ScreenPorch"  "PoolArea"     "MiscVal"

1.1.5 Low variance Filter and correlation of our numeric attributes, outlier handling and Feature engineering

We will be working on checking the vairance of each variable. We want high variance.

#The ruleset I am emplaying is if there is one variable with over 65% of the observations or 2 with over 70%, I shall remove the attribute based of low variance.  
temp[toremove] <- NULL

We shall look at the matrix of the numeric attributes and remove any that have higher than 0.75 correlation with the dependent variable

#Subsetting for numeric only
nums <- unlist(lapply(temp, is.numeric))
numONLY = temp[,nums]
aa<-cor(numONLY)
ggcorrplot(aa)

#We are only intrested in the Sales price of the matrix
aa[,14]
##            Id   LotFrontage       LotArea     YearBuilt  YearRemodAdd 
##   0.008942765   0.223665282   0.260984687   0.200101716   0.290000837 
##    MasVnrArea    BsmtFinSF1    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF 
##   0.390639498   0.206730870  -0.009306382   0.240050955   0.453943400 
##     X1stFlrSF     X2ndFlrSF  LowQualFinSF     GrLivArea    GarageCars 
##   0.565663750   0.690263374   0.135276720   1.000000000   0.466809564 
##    GarageArea    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##   0.469419238   0.249199224   0.330309795   0.005335120   0.020858798 
##   ScreenPorch      PoolArea       MiscVal     SalePrice 
##   0.102209377   0.170843563  -0.002311848   0.710191940

There are some conclusions we can make.

#Might be better for us to remove 4 attributes and create a fuller 1 attribute
temp$HouseArea <- temp$TotalBsmtSF + temp$X1stFlrSF + temp$X2ndFlrSF

# Undo some work from before for this stage.
temp$FullBath <- as.numeric(temp$FullBath)
temp$BsmtFullBath <- as.numeric(temp$BsmtFullBath)
temp$HalfBath <- as.numeric(temp$HalfBath)
temp$BsmtHalfBath <- as.numeric(temp$BsmtHalfBath)

# Why keep so many bathroom attributes. Let us condense instead.
temp$TotalBath <- temp$FullBath + 0.5*(temp$HalfBath) + temp$BsmtFullBath + 0.5*as.numeric(temp$BsmtHalfBath)
# Same with porch
temp$TotalPorchSF <- temp$OpenPorchSF + temp$EnclosedPorch + temp$X3SsnPorch + temp$ScreenPorch + temp$WoodDeckSF
# Lets include the garage now
temp$TotalArea <-temp$HouseArea + temp$GarageArea
#We also know that total basement SF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF. WHy not remove it 
#Another reason so remove total basement SF is because of its high correlation to 1stFlrSF
temp$TotalBsmtSF <- NULL
#Do not need house area, if we have total area
temp$HouseArea <- NULL

We will remove OverallQual and GrLiveArea due to its high correlation. (above our 0.7 threshold) GarageCars and Garagearea also have high correlation, and they themselves are highly correlated (0.88), so I shall remove one because they pretty much provide the same amount of information to the dependent variable. I pick Garagearea to remove because there are already a lot of area variables.

Also lets remove the attributes used in the addition, as well.

temp$OverallQual <- NULL
temp$GrLivArea <- NULL
temp$GarageArea <- NULL


temp$X1stFlrSF <-NULL
temp$X2ndFlrSF <- NULL
temp$FullBath <- NULL
temp$HalfBath <-NULL
temp$BsmtFullBath <- NULL
temp$BsmtHalfBath <- NULL

temp$OpenPorchSF <- NULL
temp$EnclosedPorch <- NULL
temp$X3SsnPorch <- NULL
temp$ScreenPorch <- NULL
temp$WoodDeckSF <- NULL
temp$PoolArea <- NULL

After our reduction, lets see our variables

colname_new <- colnames(temp)
colname_new
##  [1] "Id"            "MSSubClass"    "LotFrontage"   "LotArea"      
##  [5] "LotShape"      "Neighborhood"  "HouseStyle"    "OverallCond"  
##  [9] "YearBuilt"     "YearRemodAdd"  "RoofStyle"     "Exterior1st"  
## [13] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [17] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtExposure" 
## [21] "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2"  "BsmtFinSF2"   
## [25] "BsmtUnfSF"     "HeatingQC"     "LowQualFinSF"  "BedroomAbvGr" 
## [29] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Fireplaces"   
## [33] "FireplaceQu"   "GarageType"    "GarageYrBlt"   "GarageFinish" 
## [37] "GarageCars"    "MiscVal"       "MoSold"        "YrSold"       
## [41] "SaleCondition" "SalePrice"     "TotalBath"     "TotalPorchSF" 
## [45] "TotalArea"

We removed 25 variables so far, and added 3

#Only intrested in the attributes we added, but lets take a look
nums <- unlist(lapply(temp, is.numeric))
numONLY = temp[,nums]
aa<-cor(numONLY)
ggcorrplot(aa)

#We are only intrested in the Sales price of the matrix
aa[,15]
##           Id  LotFrontage      LotArea    YearBuilt YearRemodAdd 
## -0.030788859  0.039623156  0.185331915  0.097517499  0.180721296 
##   MasVnrArea   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF LowQualFinSF 
##  0.163072782  0.195781596  0.096143692  0.049431460  0.019967892 
##   GarageCars      MiscVal    SalePrice    TotalBath TotalPorchSF 
##  0.236456698  0.003100884  0.389503241  0.313955559  1.000000000 
##    TotalArea 
##  0.397873047
diag(var(numONLY))
##           Id  LotFrontage      LotArea    YearBuilt YearRemodAdd 
## 1.779660e+05 1.202679e+03 9.998646e+07 9.119621e+02 4.265806e+02 
##   MasVnrArea   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF LowQualFinSF 
## 3.283680e+04 2.075615e+05 2.620502e+04 1.956699e+05 2.381903e+03 
##   GarageCars      MiscVal    SalePrice    TotalBath TotalPorchSF 
## 5.609260e-01 2.479934e+05 6.297114e+09 6.144431e-01 2.461454e+04 
##    TotalArea 
## 9.187692e+05

As assumed, the areas will be correlated with one another, highly. Also the sales price is largely correlated with areas, bathrooms. I know that these areas are highly correlated but because we added so many variables into it, I will not be removign them.

Finally, we will look at addressing the outliers. I was holding off for as long as I could because they may potentially be eliminated with the removal of some of the attributes, but we might be at a point where no more attributes are to be taken out.

1.2 Bi variant analysis

Lets look for some patterns. A lot of it was discovered from the correlation table, but lets reillustrate some of it here. Truth be told, in our univariant work, we looked at some intresting combinations with the Sales price. I will re highlight these and look at some other combinations.

#Lets look at Lot
plot(temp$LotFrontage, temp$LotArea)

ggplot(data = temp, aes(y=temp$LotFrontage, x=temp$LotShape)) + geom_jitter(aes((temp$LotShape) ))

Nothing meaningful

plot(temp$TotalBath,temp$SalePrice)

plot(temp$TotalArea,temp$SalePrice)

plot(temp$GarageCars, temp$SalePrice)

plot(temp$YearBuilt, temp$SalePrice)

plot(temp$YearRemodAdd, temp$SalePrice)

#These attributes had the highest correlation with sales price. It is evident in these plot. 
#Something else we can see from the last prompts is that larger homes would have more bathrooms, and in turn higher prices. 
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$TotalBath)

#Larger homes would also have better quality fireplaces.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$FireplaceQu)

#And better exterior quality
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$ExterQual)

#And better Basement quality
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$BsmtQual)

#QUality in general makes for a better house price
qplot(temp$ExterQual,temp$SalePrice, data = temp, colour= temp$BsmtQual)

# Very imporant comparisons, used to make decisions. Great to find patterns
ggplot(data = temp, aes(x=temp$ExterQual, y=temp$SalePrice)) + geom_count(aes( factor(temp$ExterQual) )) 

ggplot(data = temp, aes(x=temp$KitchenQual, y=temp$SalePrice)) + geom_count(aes(factor(temp$KitchenQual) )) 

ggplot(data = temp, aes(x=temp$BsmtQual, y=temp$SalePrice)) + geom_count(aes(factor(temp$BsmtQual) ))

ggplot(data = temp, aes(x=temp$FireplaceQu, y=temp$SalePrice)) + geom_count(aes(factor(temp$FireplaceQu) ))

#ggplot(data = temp, aes(x=temp$Fireplace, y=temp$SalePrice)) + geom_count(aes((temp$Fireplace) ))
qplot(temp$Fireplaces,temp$SalePrice, data = temp)

ggplot(data = temp, aes(x=temp$MiscVal, y=temp$SalePrice)) + geom_count(aes((temp$MiscVal) ))

ggplot(data = temp, aes(x=temp$BsmtFinSF1, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinSF1) ))

ggplot(data = temp, aes(x=temp$BsmtFinSF2, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinSF2) ))

ggplot(data = temp, aes(x=temp$BsmtFinType2, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinType2) )) #This got through the cracks, it should have been removed.

ggplot(data = temp, aes(x=temp$MasVnrArea, y=temp$SalePrice)) + geom_count(aes((temp$MasVnrArea) ))

ggplot(data = temp, aes(x=temp$MasVnrType, y=temp$SalePrice)) + geom_count(aes((temp$MasVnrType) ))

ggplot(data = temp, aes(x=temp$BsmtUnfSF, y=temp$SalePrice)) + geom_count(aes((temp$BsmtUnfSF) ))

mean(temp$BsmtUnfSF)
## [1] 567.4651
nrow(temp[temp$BsmtUnfSF<200,])#I decide to keep this, attribute
## [1] 335
ggplot(data = temp, aes(x=temp$LowQualFinSF, y=temp$SalePrice)) + geom_count(aes((temp$LowQualFinSF) ))

ggplot(data = temp, aes(x=temp$GarageCars, y=temp$SalePrice)) + geom_count(aes((temp$GarageCars) ))

ggplot(data = temp, aes(x=temp$LotFrontage, y=temp$SalePrice)) + geom_count(aes((temp$LotFrontage) ))

nrow(temp[temp$LotFrontage<10,])#I decide to keep this, attribute
## [1] 257
ggplot(data = temp, aes(x=temp$tot, y=temp$SalePrice)) + geom_count(aes((temp$LotFrontage) ))

mean(temp$PoolArea)
## Warning in mean.default(temp$PoolArea): argument is not numeric or logical:
## returning NA
## [1] NA
mean(temp$MiscVal)
## [1] 43.81919
mean(as.numeric(temp$Fireplaces))
## [1] 0.6128364
#After observing the graphics and looking into variances and mean, these need to be removed. The means alone tell a story of how skewed of a picture these give. Because of a lack of normalization, means can show a picture as well. 
temp$Fireplace <- NULL 
temp$PoolArea <- NULL 
temp$MiscVal <- NULL 
temp$BsmtFinSF2 <- NULL
temp$MasVnrArea <- NULL
temp$LowQualFinSF <-NULL

1.3 Remove those outlier!

I saved this for as late as possible. I wanted to see if most of the attributes will be eliminated before we remove observations due to outliers.

#bsmtfinsf1
flattened_outlier = unlist(outlier_bsmtFinSF1[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed bsmtfinsf2
outlier_bsmtFinSF2 = 0
#removed masvnrarea
outlier_masVnrArea = 0
#bsmtUnfSF
flattened_outlier = unlist(outlier_bsmtUnfSF[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed garage area
outlier_garageArea = 0
#removed garage car
flattened_outlier = unlist(outlier_garagecars[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed grlivarea
outlier_grLivArea = 0
#lotarea
flattened_outlier = unlist(outlier_lotArea[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#lotfrontage
flattened_outlier = unlist(outlier_lotFrontage[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed lowqualfin
outlier_lowQualFinSF = 0
#removed misc
outlier_miscVal = 0
#removed OpenPorch
outlier_openPorch = 0
#removed pool
outlier_poolarea = 0
#removed screen porch 
outlier_screenPorch= 0
#removed total Basement SF 
outlier_totalBsmtSF= 0
#removed wood deck 
outlier_woodDeck= 0
#removed 1st flr sf 
outlier_x1stFlrSF= 0
#removed 2nd flr sf 
outlier_x2stFlrSF= 0
#removed 3Ss
outlier_X3Ss= 0

print("it is a good idea we waited to remove the outliers, otherwise we would have lost a lot of observations only to remove the attributes later")
## [1] "it is a good idea we waited to remove the outliers, otherwise we would have lost a lot of observations only to remove the attributes later"
#I will allow this outliers:
#outlier_yearbuilt 
#after all our intial analysis, we can set the data back to our original name
modified_data = temp
modified_data = subset(modified_data, select=-c(SalePrice))
modified_data = subset(modified_data, select=-c(YearRemodAdd))
modified_data$SalePrice = temp$SalePrice
modified_data <- droplevels(modified_data)

2 - EDA

2.1 Normalization

#we do not need an id anymore
modified_data = subset(modified_data, select=-c(Id))
# Our Normalizing technique
normalize <- function(x) {
  if (is.numeric(x)){
  return ((x - min(x)) / (max(x) - min(x))) }
  else{
    return (x)
  }
}
data_norm = as.data.frame(lapply(modified_data[1:38], normalize))
data_norm <- cbind(modified_data$SalePrice, data_norm)
colnames(data_norm)[colnames(data_norm)=="modified_data$SalePrice"] <- "SalePrice"
data_norm_noout <- subset(data_norm, data_norm$SalePrice < boxplot(data_norm$SalePrice)$stats[5,])

2.2 Clustering

numsonly <- unlist(lapply(modified_data, is.numeric))
numarray = modified_data[,numsonly]
#numarray = subset(numarray, select = -c(Id))
fit = kmeans(numarray,4)
plotcluster(numarray,fit$cluster)

#str(fit)

fit = kmodes(numarray, 4)
## Warning in kmodes(numarray, 4): data has numeric coloumns with more than 30
## different levels!
plotcluster(numarray,fit$cluster)

# It is not meaningful to have clustering for categorical variables. I did only numerical values.

In fact, these clusters are not meaningful for the numerical only attributes, either! It would have been a better tool to be used in EDA of 2-5 variables perhaps, but I will do without it.

We are now into our Deisgn and Testing stage!!!

3 - Experimental Design and Modeling

3.0 Training and test set Split

set.seed(11)
#Splitting training to 80%, test to 20%
index <- sample(1:nrow(data_norm_noout), 0.80 *nrow(data_norm_noout))
data_train <- data_norm[index,]
data_test <- data_norm[-index,]

data_train_noout <- data_norm_noout[index,]
data_test_noout <- data_norm_noout[-index,]

rf_data = modified_data
rf_data$GarageYrBlt = unlist(lapply(modified_data$GarageYrBlt, as.numeric), use.name = FALSE )
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion

## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
rf_data = subset(rf_data, select= -c(GarageYrBlt))

rdata_norm = as.data.frame(lapply(rf_data[1:38], normalize))
rdata_norm <- cbind(modified_data$SalePrice, rdata_norm)
colnames(rdata_norm)[colnames(rdata_norm)=="modified_data$SalePrice"] <- "SalePrice"
# training to 80%, test to 20%
index <- sample(1:nrow(rdata_norm), 0.80 *nrow(rdata_norm))
rdata_train <- rdata_norm[index,]
rdata_test <- rdata_norm[-index,]

rdata_norm_noout <- subset(rdata_norm, rdata_norm$SalePrice < boxplot(rdata_norm$SalePrice)$stats[5,])

# training to 80%, test to 20%
index <- sample(1:nrow(rdata_norm_noout), 0.80 *nrow(rdata_norm_noout))
rdata_train_noout <- rdata_norm_noout[index,]
rdata_test_noout <- rdata_norm_noout[-index,]

MAE <- function(actual, predicted){
   mean(abs(actual- predicted))
 }

RMSE <- function(actual, predicted){
    sqrt(mean((predicted-actual)^2))
  }

3.1 KNN

#Our label is the Sales price, in col 1
# trainlabel <- data_train[,1]
# testlabel <- data_test[,1]

#Applying KNN
##test_pred <- knn(train = data_train[,2:39], test = data_test[,2:39],cl = data_train[,1], k=9)
#Creating accuracy matrix
##CrossTable(x=testlabel, y=test_pred, prop.chisq=FALSE) # This makes no sense for non-class prediction (i.e Regression)

Learned that KNN might not be a good suit for someone with categorical data within the DF. We shall need to look at regression. shi#3.2.1 Decision Tree 1.

set.seed(11)
tree_model <- tree(rdata_train$SalePrice ~ . , data = rdata_train)
plot(tree_model)
text(tree_model)

tree_pred = predict(tree_model, rdata_test)

MAE(rdata_test$SalePrice, tree_pred)
## [1] 23439.04
RMSE(rdata_test$SalePrice, tree_pred)
## [1] 33673.1
cv_tree = cv.tree(tree_model)
names(cv_tree)
## [1] "size"   "dev"    "k"      "method"
plot (cv_tree$size, cv_tree$dev, type = "b", xlab = "Tree Size", ylab = "MSE")

cv_tree$size[which.min(cv_tree$dev)]
## [1] 10
print("No need to prune, we are using size 9 tree")
## [1] "No need to prune, we are using size 9 tree"

3.2.2 Decision Tree 2.

set.seed(11)

m1 <- rpart(data_train$SalePrice ~ ., data = data_train, method = "anova")
m1
## n= 1024 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1024 4.722550e+12 173891.20  
##    2) TotalArea< 0.5482043 778 1.239122e+12 146996.20  
##      4) TotalArea< 0.389993 365 2.751843e+11 119109.70  
##        8) TotalArea< 0.2779812 87 3.744565e+10  91489.78 *
##        9) TotalArea>=0.2779812 278 1.506000e+11 127753.30 *
##      5) TotalArea>=0.389993 413 4.292363e+11 171641.70  
##       10) Neighborhood=Blueste,BrkSide,Edwards,IDOTRR,MeadowV,Mitchel,NAmes,NPkVill,NWAmes,OldTown,Sawyer,SWISU 219 1.256293e+11 152961.30 *
##       11) Neighborhood=Blmngtn,ClearCr,CollgCr,Crawfor,Gilbert,NoRidge,NridgHt,SawyerW,Somerst,StoneBr,Timber,Veenker 194 1.409153e+11 192729.30 *
##    3) TotalArea>=0.5482043 246 1.140887e+12 258949.30  
##      6) TotalArea< 0.6956939 188 5.297778e+11 235240.90  
##       12) GarageYrBlt=1920,1945,1948,1949,1955,1956,1960,1963,1968,1969,1970,1973,1976,1977,1980,1981,1984,1985,1986,1987,1992,NoG 40 7.571870e+10 175180.10 *
##       13) GarageYrBlt=1926,1932,1935,1950,1953,1957,1959,1961,1974,1983,1988,1989,1990,1991,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009 148 2.707692e+11 251473.60  
##         26) BsmtQual=Gd,TA 120 1.216738e+11 239645.30 *
##         27) BsmtQual=Ex 28 6.035381e+10 302166.20 *
##      7) TotalArea>=0.6956939 58 1.629136e+11 335797.10  
##       14) GarageYrBlt=1959,1968,1976,1977,1981,1988,1990,1992,1993,1994,1995,1996,1997,1998,1999,2000,2003,2004,2007 42 6.844690e+10 315623.00 *
##       15) GarageYrBlt=1934,1982,2001,2005,2006,2008,2009,2010 16 3.250177e+10 388754.10 *
regress_plot <- rpart.plot(m1, type=2, digits=3, fallen.leaves = TRUE)

p1 <- predict(m1, data_test)

# Two error checking methods.
MAE(data_test$SalePrice, p1)
## [1] 23574.07
RMSE(data_test$SalePrice, p1)
## [1] 33262.01

3.3 Random Forest

set.seed(11)
model_RF<- randomForest(rdata_train$SalePrice ~ ., data = rdata_train, proximity=TRUE)
model_RF
## 
## Call:
##  randomForest(formula = rdata_train$SalePrice ~ ., data = rdata_train,      proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 12
## 
##           Mean of squared residuals: 527615323
##                     % Var explained: 88.29
rf_predict = predict(model_RF,rdata_test)

RMSE(rdata_test$SalePrice,rf_predict)
## [1] 22934.38
MAE(rdata_test$SalePrice,rf_predict)
## [1] 14421.09
# Increasing the tree coutn doesnt significantly decrease the errors

3.4.1 Regression

## ## PROBLEM HERE BECAUSE THERE ARE CASES WHERE FACTORS IN TEST ARE NOT PRESENT IN TRAINING
set.seed(11)
rregressive_model <- lm(rdata_train$SalePrice ~ ., data = rdata_train)
plot(rregressive_model)
## Warning: not plotting observations with leverage one:
##   51, 158, 184, 253, 572, 592, 611, 652, 762, 822, 1028

## Warning: not plotting observations with leverage one:
##   51, 158, 184, 253, 572, 592, 611, 652, 762, 822, 1028

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

# Residual vs Fitted: No real pattern between Residuals and Fitted values. THis is good. Residuals arefactors our LR did not consider. We dont want those to have 
# patterns. However, the increasing slop after the 250 000 mark is concerning.
# QQ-plot: Not completly m=1 slope: not perfectly linear. Also the tails suggest that our model is light tailed, Still could be considered linear
# Scale_Location: Here we clearly see the most of the data follow an non-linear form. Increasing 
# FInally the Residual vs Leverage plot tells me there is not a highly influensial oberservation overly skewing our predictor. WHich is good

3.4.2 Ridge, Lasso and elastic net regression

set.seed(11)
#both reduce vairance
#if you know your variables all very usefuls, use ridge
# Ridge alpha = 0.  you are desensitizing your model to the training data. Good to combat overfitting issue of best least squares line
alpha0.fit <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure = "mse", alpha = 0, family= "gaussian")
alpha0.predicted <- predict(alpha0.fit, s=alpha0.fit$lambda.1se, newx= data.matrix(data_test[,2:39]))
# Lasso alpha = 1. similar equations, BUTT helps remove useless variables
alpha1.fit <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure = "mse", alpha = 1, family= "gaussian")
alpha1.predicted <- predict(alpha1.fit, s=alpha1.fit$lambda.1se, newx= data.matrix(data_test[,2:39]))

MAE(data_test[,1],alpha0.predicted)
## [1] 18438.22
RMSE(data_test[,1],alpha0.predicted)
## [1] 25375.74
MAE(data_test[,1],alpha1.predicted)
## [1] 17607.39
RMSE(data_test[,1],alpha1.predicted)
## [1] 24931.68
# So we can see the differences in error between the weights of Ridge and Lasso
# We will create a Loop to find the best alpha value!
list.of.fits <- list()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10) #create name alpha with the alpha value as loop increments
  list.of.fits[[fit.name]] <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure="mae", alpha=i/10, family="gaussian")
}
results <- data.frame() # Database for the resulting error matrix
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  ## Use each model to predict 'y' given the Testing dataset
  predicted <- predict(list.of.fits[[fit.name]], s=list.of.fits[[fit.name]]$lambda.1se, newx=data.matrix(data_test[,2:39]))
  ## Error
  RMSE = sqrt(mean((predicted-data_test[,1])^2))
  
  ## Results 
  temp <- data.frame(alpha=i/10, RMSE=RMSE, fit.name=fit.name)
  results <- rbind(results, temp)
}
results
# We favour more of the lasso technique with an alpha of 0.6

# Lasso alpha = 1. similar equations, BUTT helps remove useless variables
ENR <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure = "mse", alpha = 0.6, family= "gaussian")
ENR.predicted <- predict(ENR, s=ENR$lambda.1se, newx= data.matrix(data_test[,2:39]))

4 - Evaluating Models

4.0 Comparing models

MAE <- function(actual, predicted){
   mean(abs(actual- predicted))
 }

RMSE <- function(actual, predicted){
    sqrt(mean((predicted-actual)^2))
  }
print("For Regression Models RMSE and MAE are commonly used when comparing regression models")
## [1] "For Regression Models RMSE and MAE are commonly used when comparing regression models"
temp_DT1 <- data.frame(Model="Decision Tree 1", RMSE=RMSE(rdata_test$SalePrice, tree_pred), MAE= MAE(rdata_test$SalePrice, tree_pred))
temp_DT2 <- data.frame(Model="Decision Tree 2", RMSE=RMSE(data_test$SalePrice, p1), MAE= MAE(data_test$SalePrice, p1))
temp_RF <- data.frame(Model="Random Forest", RMSE=RMSE(rdata_test$SalePrice,rf_predict), MAE= MAE(rdata_test$SalePrice,rf_predict))
temp_ENReg <- data.frame(Model="Elastic Net Regression", RMSE=RMSE(data_test$SalePrice,ENR.predicted), MAE= MAE(data_test$SalePrice,ENR.predicted))
err_results <- rbind(temp_DT1, temp_DT2, temp_RF, temp_ENReg)
err_results

4.1 Lets remove those outliers and re run models!

MAE <- function(actual, predicted){
   mean(abs(actual- predicted))
 }

RMSE <- function(actual, predicted){
    sqrt(mean((predicted-actual)^2))
  }
noout_DT1 <- tree(rdata_train_noout$SalePrice ~ . , data = rdata_train_noout)
DT1_pred <- predict(noout_DT1, rdata_test_noout)

# Removed 2nd type of DT because we saw that it performed worse than the other DT.

noout_RF<- randomForest(rdata_train_noout$SalePrice ~ ., data = rdata_train_noout, proximity=TRUE)
rf_pred = predict(noout_RF,rdata_test_noout)

noout_ENR <- cv.glmnet(x = data.matrix(data_train_noout[,2:39]), y = data_train_noout[,1], type.measure = "mse", alpha = 0.6, family= "gaussian")
END_pred <- predict(noout_ENR, s=noout_ENR$lambda.1se, newx= data.matrix(data_test_noout[,2:39]))


DT1 <- data.frame(Model="Decision Tree", RMSE=RMSE(rdata_test_noout$SalePrice, DT1_pred), MAE= MAE(rdata_test_noout$SalePrice, DT1_pred))
RF <- data.frame(Model="Random Forest", RMSE=RMSE(rdata_test_noout$SalePrice,rf_pred), MAE= MAE(rdata_test_noout$SalePrice,rf_pred))
ENReg <- data.frame(Model="Elastic Net Regression", RMSE=RMSE(data_test_noout$SalePrice,END_pred), MAE= MAE(data_test_noout$SalePrice,END_pred))
error_results <- rbind(DT1,RF,ENReg)
error_results
#Looking into the new data without outliers
outregressive_model <- lm(rdata_train_noout$SalePrice ~ ., data = rdata_train_noout)
plot(outregressive_model)
## Warning: not plotting observations with leverage one:
##   107, 146, 236, 283, 343, 406, 416, 450, 484, 608, 656, 705, 707, 751, 780, 876, 971, 997

## Warning: not plotting observations with leverage one:
##   107, 146, 236, 283, 343, 406, 416, 450, 484, 608, 656, 705, 707, 751, 780, 876, 971, 997

# Residual vs Fitted: There is a pattern between Residuals and Fitted values. THis is not good. this should be random. BAD
# QQ-plot: We wanted this to be 
# Scale_Location: Here we clearly see the most of the data follow an non-linear form. Increasing 
# FInally the Residual vs Leverage plot tells me there is not a highly influensial oberservation overly skewing our predictor. WHich is good

It is evident that removing those outliers definetly helped imporve our Regression Tree and Regression Forest, however our Elastic net regression did not chnage too much.